from PyCmpltrtok.common import sep
from datasets import Dataset, load_from_disk
import pandas as pd

# path = r'D:\_const\large_code\from-Dr-Sun-named_entity_recognition-master\ResumeNER\train_small.char-limit-300.hf'
# path = '/home/yunpeng/datasets/cner/dev_small.char-limit-50.hf'
# Location of the dataset directory to inspect (loaded with `datasets.load_from_disk`).
path = '/home/yunpeng/datasets/cner/train.char.hf'

# Load the dataset and print its summary.
ds = load_from_disk(path)
print(ds)

# Pull out the two columns used by the rest of the script: the per-sample
# tokens and the per-sample tags that accompany them.
tokens, ner_tags = ds['tokens'], ds['ner_tags']

# Preview the first few samples, printing each token next to its tag.
# The limit is checked *before* emitting the separator so that no dangling
# separator is printed for a sample that is never shown.
PREVIEW_COUNT = 6  # samples with index 0..5 are displayed
for i, this_tokens in enumerate(tokens):
    if i >= PREVIEW_COUNT:
        break
    sep(i)  # separator/header for sample i (helper from PyCmpltrtok.common)
    # Tokens and tags are paired positionally within a sample.
    for token, tag in zip(this_tokens, ner_tags[i]):
        print(token, tag)

# Flatten the per-sample tag lists into one flat list covering every sample.
tags = [tag for sample_tags in ner_tags for tag in sample_tags]

# Tally how often each distinct tag occurs across the whole dataset.
s = pd.Series(tags)
counts = s.value_counts()
print(counts)

# Also list the distinct tag values in sorted order.
print(sorted(counts.index))
